/*
	This program is free software; you can redistribute it and/or modify
	it under the terms of the GNU General Public License version 2 
	as published by the Free Software Foundation.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program; if not, write to the Free Software
	Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA


	Copyright (C) 2006  Thierry Berger-Perrin <tbptbp@gmail.com>
*/

#include "specifics.h"
#include "rt_render.h"

#include "math_constants.h"
#include "math_vec.h"
#include "math_aabb.h"

#include <memory>

#include "sse.h"
#include "sse_approx.h"

// a packed scalar in disguise
union MM_ALIGN16 sse_t {
	__m128		ps;
	__m128i		pi;
	float32_t	f[4];
	int32_t		i[4];
	struct { float32_t	f0,f1,f2,f3; };
	struct { int32_t	i0,i1,i2,i3; };

	sse_t() {}
	sse_t(const __m128 v) : ps(v) {}
	explicit sse_t(const float a, const float b, const float c, const float d) : f0(a),f1(b),f2(c),f3(d) {}



	sse_t operator *(const float rhs) const { return mulps(ps, set1ps(rhs)); }
	sse_t operator +(const float rhs) const { return addps(ps, set1ps(rhs)); }

	/* no need for a kludge
	sse_t & operator =(const sse_t rhs) */

	// bad codegen ensues
	//operator const __m128 &() const { return ps; }
	//operator const __m128 () const { return ps; }

	/*
	template <bool pick0, bool pick1, bool pick2, bool pick3> sse_t select() const {
		enum { mask_idx = pick0 | pick1 << 1 | pick2 << 2 | pick3 << 3 };
		return andps(cst::section.selection_multiple[mask_idx].ps, ps);
	}
	*/

	// returns a mask with selected scalar components.
	template <bool pick0, bool pick1, bool pick2, bool pick3> static sse_t select() {
		enum { mask_idx = pick0 | pick1 << 1 | pick2 << 2 | pick3 << 3 };
		return cst::section.selection_multiple[mask_idx].ps;
	}
};

// a quad vec or 4x3-component vectors in mostly SoA form.
struct MM_ALIGN16 qvec_t {
	// have to pick one notation, either named or subscripted: since sse_t is non-pod we can't have an union.
	// there's more hinderance with subscripting and we can hack around indexing so...
	sse_t x,y,z;

	qvec_t() {}
	//qvec_t(const __m128 &v) : x(v) {}
	//qvec_t(const __m128 &a, const __m128 &b, const __m128 &c) : x(a),y(b),z(c) {}
	FINLINE qvec_t(const __m128 a, const __m128 b, const __m128 c) : x(a),y(b),z(c) {}
	//qvec_t(const qvec_t &q) : x(q.x.ps), y(q.y.ps), z(q.z.ps)  {}
	//qvec_t(const sse_t &a, const sse_t &b, const sse_t &c) : x(a.ps),y(b.ps),z(c.ps) {}

	FINLINE qvec_t(const qvec_t &q) : x(q.x.ps), y(q.y.ps), z(q.z.ps)  {}

	//const sse_t &operator [](const uint_t axis) const { return *((const sse_t * __restrict const)this+axis); }
	const __m128 &operator [](const uint_t axis) const { return ((const sse_t * __restrict const)this+axis)->ps; }

	static FINLINE qvec_t from(const vec_t &v) { return qvec_t(set1ps(v.x), set1ps(v.y), set1ps(v.z)); }

	// gcc goes nuts without such heavy handed hint.
	#if defined __ICC__ || defined __GCC__
		qvec_t & operator =(const qvec_t &q) {
			x.ps = q.x.ps; y.ps = q.y.ps; z.ps = q.z.ps;
			return *this;
		}
	#endif

	
	FINLINE qvec_t rcp() const { return qvec_t(divps(cst::section.one.ps, x.ps), divps(cst::section.one.ps, y.ps), divps(cst::section.one.ps, z.ps)); }
	#if 0
		FINLINE qvec_t fast_rcp() const { return qvec_t(rcpps(x.ps), rcpps(y.ps), rcpps(z.ps)); }
	#else
		FINLINE qvec_t fast_rcp() const { return qvec_t(sse::rcp_nr_robust_ps(x.ps), sse::rcp_nr_robust_ps(y.ps), sse::rcp_nr_robust_ps(z.ps)); }
	#endif

	/*
	qvec_t operator +(const qvec_t &rhs) const { return qvec_t(addps(x.ps, rhs.x.ps), addps(y.ps, rhs.y.ps), addps(z.ps, rhs.z.ps)); }
	qvec_t &operator +=(const qvec_t &rhs) { x.ps = addps(x.ps, rhs.x.ps); y.ps = addps(y.ps, rhs.y.ps); z.ps = addps(z.ps, rhs.z.ps); return *this; }

	qvec_t operator *(const float rhs) const { const __m128 v(set1ps(rhs)); return qvec_t(mulps(v, x.ps), mulps(v, y.ps), mulps(v, z.ps)); }
	qvec_t operator *(const sse_t &rhs) const {  return qvec_t(mulps(x.ps, rhs.ps), mulps(y.ps, rhs.ps), mulps(z.ps, rhs.ps)); }
	qvec_t operator *(const qvec_t &rhs) const { return qvec_t(mulps(x.ps, rhs.x.ps), mulps(y.ps, rhs.y.ps), mulps(z.ps, rhs.z.ps)); }
	qvec_t &operator *=(const qvec_t &rhs) { x.ps = mulps(x.ps, rhs.x.ps); y.ps = mulps(y.ps, rhs.y.ps); z.ps = mulps(z.ps, rhs.z.ps); return *this; }

	qvec_t operator &(const sse_t rhs) const { return qvec_t(andps(x.ps, rhs.ps), andps(y.ps, rhs.ps), andps(z.ps, rhs.ps)); }
	qvec_t operator |(const sse_t rhs) const { return qvec_t(orps(x.ps, rhs.ps), orps(y.ps, rhs.ps), orps(z.ps, rhs.ps)); }
	qvec_t operator ^(const sse_t rhs) const { return qvec_t(xorps(x.ps, rhs.ps), xorps(y.ps, rhs.ps), xorps(z.ps, rhs.ps)); }




	template <bool pick0, bool pick1, bool pick2, bool pick3> qvec_t select() const {
		enum { mask_idx = pick0 | pick1 << 1 | pick2 << 2 | pick3 << 3 };
		return *this & cst::section.selection_multiple[mask_idx].ps;
	}
	*/

};

// returns the index of the least significant 1-bit of x
static FINLINE int32_t bit_scan_forward32(const uint32_t x) {
	#if defined __GCC__ || defined __ICC_GCC__
		return __builtin_ffs(x) - 1;
	#elif defined __MSVC__
		int32_t idx;
		_BitScanForward((DWORD*)&idx, x);
		return idx;
	#elif defined __ICC__
		return _bit_scan_forward(x);
		/*
		// pfff.
		#pragma nounroll
		for (uint_t i=0; i<32; ++i)
			if (x & (1<<i)) return i;
		return 0;
		*/
	#endif
}
namespace rt {
	/*
		Structures & helpers for single ray packets, that is with a direct mapping to SSE packed scalars (4x1)
	*/
	namespace packet {
		struct hit_t {
			sse_t
				t,
				u,v,
				id;	// <-- those are really packed integers.
			
			hit_t() : t(cst::section.plus_inf.ps), u(all_zero()), v(all_zero()), id(bits_all_set()) {}

			void reset() { *this = hit_t(); }

			// all id != ~0u
			FINLINE bool_t is_not_totally_missed() const { return movemask(castpi2ps(_mm_cmpgt_epi32(id.pi, bits_all_set()))); }
		};

		struct ray_segment_t {
			sse_t t_near, t_far;
		};

		struct ray_t {
			qvec_t pos, rcp_dir, dir;
			uint_t	signs[4];

			explicit ray_t(const qvec_t &org) : pos(org) {}
			//explicit ray_t(const vec_t &org) : 

			// lies, damn lies!
			FINLINE void set_dir(const qvec_t &d) const {
				(qvec_t &) dir = d;
				// (qvec_t &) rcp_dir = d.rcp();
				(qvec_t &) rcp_dir = d.fast_rcp();
				// (qvec_t &) rcp_dir = qvec_t(sse::rcp_nr_robust_ps(d.x.ps), sse::rcp_nr_robust_ps(d.y.ps), sse::rcp_nr_robust_ps(d.z.ps));
			}
			FINLINE void set_signs(const bool_t s0, const bool_t s1, const bool_t s2) const {
				(bool_t &) signs[0] = s0;
				(bool_t &) signs[1] = s1;
				(bool_t &) signs[2] = s2;
				(bool_t &) signs[3] = false;
			}
		};


		struct trace_stack_t {
			enum { max_lvl = 32 /*64*/ };
			ray_segment_t								segments[max_lvl];
			const kdtree::node_t		* __restrict	nodes[max_lvl];
			int_t idx;

			trace_stack_t() : idx(0) {}

			void reset() { idx = 0; }
		};

#if 1
		static FINLINE bool_t intersect_ray_box_robust(const aabb_t &box, const ray_t &ray, ray_segment_t &rs);
		static FINLINE void intersect_ray_tri(const wald_tri_t &tri, const ray_t &ray, hit_t &hit, const __m128 inactive);
		static FINLINE void intersect_all(const scene_t &scene, const int list, const int count, const ray_t &ray, hit_t &hit, const __m128 inactive);
		static FINLINE void trace_segment(const scene_t &scene, const ray_t &ray, hit_t &hit, ray_segment_t &rs, trace_stack_t &stack);
		static FINLINE void trace(const scene_t &scene, const ray_t &ray, hit_t &hit, trace_stack_t &stack);
#else
		static NOINLINE bool_t intersect_ray_box_robust(const aabb_t &box, const ray_t &ray, ray_segment_t &rs);
		static FINLINE void intersect_ray_tri(const wald_tri_t &tri, const ray_t &ray, hit_t &hit, const __m128 inactive);
		static FINLINE void intersect_all(const scene_t &scene, const int list, const int count, const ray_t &ray, hit_t &hit, const __m128 inactive);
		static NOINLINE void trace_segment(const scene_t &scene, const ray_t &ray, hit_t &hit, ray_segment_t &rs, trace_stack_t &stack);
		static NOINLINE void trace(const scene_t &scene, const ray_t &ray, hit_t &hit, trace_stack_t &stack);
#endif

		static FINLINE __m128 compute_color_scale(const vec_t extent) {
			#if 0
				// icc ICE on that one, hahha.
				const float scale_depth = extent.get_max() * 0.25f;
				const __m128 color_scale = { 1.f*scale_depth, 0.5f*scale_depth, 0.25f*scale_depth, 0*scale_depth };
				return color_scale;
			#else
				const __m128 depth = set1ps(extent.get_max() * 0.25f);
				const __m128 color_scale = { 1.f, 0.5f, 0.25f, 0 };
				return mulps(depth, color_scale);
			#endif
		}

		static FINLINE void do_render_tile(const renderer_t &renderer, const point_t &corner) {
			// BREAKPOINT();
			enum { packet_size_x = 2, packet_size_y = packet_size_x };


			trace_stack_t trace_stack;

			const qvec_t 
				sampler_dx(qvec_t::from(renderer.rt.camera.sampler.dx)), sampler_dy(qvec_t::from(renderer.rt.camera.sampler.dy)),
				// tile scanning increments
				scan_dx(qvec_t::from(renderer.rt.camera.sampler.dx*packet_size_x)), scan_dy(qvec_t::from(renderer.rt.camera.sampler.dy*packet_size_y));
	
			// compute offsets/patterns within a packet (condensed into a scanline offset).
			// preserve precision as much as possible.
			// { (0,0),  (1,0),
			//   (0,1),  (1,0) } + corner
			const __m128 
				offset_dx(addps(set1ps(float(corner.x)), andps((sse_t::select<0,1,0,1>().ps), cst::section.one.ps))),
				offset_dy(addps(set1ps(float(corner.y)), andps((sse_t::select<0,0,1,1>().ps), cst::section.one.ps)));
			// { (cx+0,cy+0),  (cx+1,cy+0),
			//   (cx+0,cy+1),  (cx+1,cy+1) } * (dx,dy) +  sampler.top
			qvec_t scan_line(
				addps(set1ps(renderer.rt.camera.sampler.top.x), 
					addps(
						mulps(sampler_dx.x.ps, offset_dx),
						mulps(sampler_dy.x.ps, offset_dy) ) ),
				addps(set1ps(renderer.rt.camera.sampler.top.y), 
					addps(
						mulps(sampler_dx.y.ps, offset_dx),
						mulps(sampler_dy.y.ps, offset_dy) ) ),
				addps(set1ps(renderer.rt.camera.sampler.top.z), 
					addps(
						mulps(sampler_dx.z.ps, offset_dx),
						mulps(sampler_dy.z.ps, offset_dy) ) ) );

			const ray_t ray(qvec_t::from(renderer.rt.camera.get_eye()));

			const __m128 color_scale(compute_color_scale(renderer.rt.scene.bounding_box.get_extent()));
			//const __m128 color_scale;

			const point_t &res(renderer.framebuffer.get_resolution());
			uint_t pixel_line = corner.y*res.x + corner.x;
			for (int dec_y=tile_size_y/packet_size_y; dec_y; --dec_y) {
				qvec_t scan(scan_line);
				uint_t pixel_idx = pixel_line;
				for (int dec_x=tile_size_x/packet_size_x; dec_x; --dec_x) {
					hit_t hit;
					ray.set_dir(scan);

					trace(renderer.rt.scene, ray, hit, trace_stack);

					#define shadeit(ridx) mulps(rcpps(splatps(hit.t.ps, (ridx))), color_scale)
					renderer.framebuffer.plot(pixel_idx+0 + res.x*0, shadeit(0));
					renderer.framebuffer.plot(pixel_idx+1 + res.x*0, shadeit(1));
					renderer.framebuffer.plot(pixel_idx+0 + res.x*1, shadeit(2));
					renderer.framebuffer.plot(pixel_idx+1 + res.x*1, shadeit(3));

					scan.x.ps = addps(scan.x.ps, scan_dx.x.ps); scan.y.ps = addps(scan.y.ps, scan_dx.y.ps); scan.z.ps = addps(scan.z.ps, scan_dx.z.ps);
					pixel_idx += 2;
				}

				scan_line.x.ps = addps(scan_line.x.ps, scan_dy.x.ps); scan_line.y.ps = addps(scan_line.y.ps, scan_dy.y.ps); scan_line.z.ps = addps(scan_line.z.ps, scan_dy.z.ps);
				pixel_line += 2*res.x;
			}
		}

		/*
			First stage for tracing a single ray packet:
			- intersect with the scene bounding box, if all missed exit else clip ray segments
			- if packet is incoherent do multiple traces grouping as many sub-rays as possible and disabling others each time
		*/
		void trace(const scene_t &scene, const ray_t &ray, hit_t &hit, trace_stack_t &stack) {
			ray_segment_t MM_ALIGN16 rs;
			const bool_t bbox_hit = intersect_ray_box_robust(scene.bounding_box, ray, rs); //packet_ray_box_intersect(scene.bounding_box, ray, rs);
			rs.t_near.ps = maxps(rs.t_near.ps, all_zero());
			
			enum { sign_value = 8 };
			if (bbox_hit) {
				const uint_t sx = movemask(ray.dir.x.ps), sy = movemask(ray.dir.y.ps), sz = movemask(ray.dir.z.ps);
				const bool_t coherent = 					
							((sx == 0)|(sx == 15)) &
							((sy == 0)|(sy == 15)) &
							((sz == 0)|(sz == 15));

				if (coherent) {
					ray.set_signs((sx == 0) ? sign_value : 0, (sy == 0) ? sign_value : 0, (sz == 0) ? sign_value : 0);

					trace_segment(scene, ray, hit, rs, stack);
				}
				else {
					if (bbox_hit) {
						ray_segment_t MM_ALIGN16  saved_rs;
						saved_rs.t_near.ps = rs.t_near.ps;
						saved_rs.t_far.ps = rs.t_far.ps;

						// with a bit of logic, iterate through the ray set
						// finding & tracing subsets with matching directions.
						//
						// also branchless on msvc, phew.
						uint_t idx = 0, done = 0;
						do {
							const uint_t 
								// extract signs for the selected ray
								idx_mask = 1u<<idx,
								bit_x = sx & idx_mask, bit_y = sy & idx_mask, bit_z = sz & idx_mask,
				
								// ray directions for this bundle
								rsx = !bit_x ? sign_value : 0, rsy = !bit_y ? sign_value : 0, rsz = !bit_z ? sign_value : 0,

								// find matching sign combo in other rays
								mask_x = bit_x ? ~0u : 0, mask_y = bit_y ? ~0u : 0, mask_z = bit_z ? ~0u : 0,

								no_match_x = sx^mask_x, no_match_y = sy^mask_y, no_match_z = sz^mask_z,

								exclude = no_match_x | no_match_y | no_match_z,
								include = ~exclude & 15u;
							
							// directions for the bundle
							ray.set_signs(rsx, rsy, rsz);

							// inactivate rays by setting up a bogus ray segment
							rs.t_near.ps = cond_moveps(cst::section.selection_multiple[include].ps,	cst::section.plus_inf.ps,	saved_rs.t_near.ps);
							rs.t_far.ps = cond_moveps(cst::section.selection_multiple[include].ps,	cst::section.minus_inf.ps,	saved_rs.t_far.ps);

							// trace that subset
							trace_segment(scene, ray, hit, rs, stack);

							// select the next ray from those not yet done
							// note: we only bsf 0 when done, so that's not an issue
							done |= include;
							idx = bit_scan_forward32(~done & 15u);
						} while (done < 15u);
					}
				}
			}	// bbox missed.
		}


		//
		// aabb ray intersection, fast & NaN oblivious version.
		//
		//FIX:
#if 0
		bool_t intersect_ray_box(const aabb_t &box, const ray_t &ray, ray_segment_t &rs) {
			__m128
				l1 = mulps(ray.inv_dir.x.v, subps(set1ps(box.get_min().x), ray.pos.x.v)),
				l2 = mulps(ray.inv_dir.x.v, subps(set1ps(box.get_max().x), ray.pos.x.v)),
				lmin = minps(l1,l2),
				lmax = maxps(l1,l2);

			l1 = mulps(ray.inv_dir.y.v, subps(set1ps(box.get_min().y), ray.pos.y.v));
			l2 = mulps(ray.inv_dir.y.v, subps(set1ps(box.get_max().y), ray.pos.y.v));
			lmin = maxps(minps(l1,l2), lmin);
			lmax = minps(maxps(l1,l2), lmax);

			l1 = mulps(ray.inv_dir.z.v, subps(set1ps(box.get_min().z), ray.pos.z.v));
			l2 = mulps(ray.inv_dir.z.v, subps(set1ps(box.get_max().z), ray.pos.z.v));
			lmin = maxps(minps(l1,l2), lmin);
			lmax = minps(maxps(l1,l2), lmax);

			const bool_t rc = !mask_all(orps(cmplt(lmax, all_zero()),cmpgt(lmin, lmax)));
			rs.t_near.v	= lmin; //rs.t_near.v	= maxps(lmin, all_zero());	// clamped to 0. more efficient here.
			rs.t_far.v	= lmax;

			return rc;
		}
#endif

		bool_t intersect_ray_box_robust(const aabb_t &box, const ray_t &ray, ray_segment_t &rs) {
			const __m128
				xl1 = mulps(ray.rcp_dir.x.ps, subps(set1ps(box.pmin.x), ray.pos.x.ps)),
				xl2 = mulps(ray.rcp_dir.x.ps, subps(set1ps(box.pmax.x), ray.pos.x.ps)),
				xl1a = minps(xl1, cst::section.plus_inf.ps),  xl2a = minps(xl2, cst::section.plus_inf.ps),
				xl1b = maxps(xl1, cst::section.minus_inf.ps), xl2b = maxps(xl2, cst::section.minus_inf.ps);

			__m128
				lmax = maxps(xl1a,xl2a),
				lmin = minps(xl1b,xl2b);

			const __m128
				yl1 = mulps(ray.rcp_dir.y.ps, subps(set1ps(box.pmin.y), ray.pos.y.ps)),
				yl2 = mulps(ray.rcp_dir.y.ps, subps(set1ps(box.pmax.y), ray.pos.y.ps)),
				yl1a = minps(yl1, cst::section.plus_inf.ps),  yl2a = minps(yl2, cst::section.plus_inf.ps),
				yl1b = maxps(yl1, cst::section.minus_inf.ps), yl2b = maxps(yl2, cst::section.minus_inf.ps);

			lmax = minps(maxps(yl1a,yl2a), lmax);
			lmin = maxps(minps(yl1b,yl2b), lmin);

			const __m128
				zl1 = mulps(ray.rcp_dir.z.ps, subps(set1ps(box.pmin.z), ray.pos.z.ps)),
				zl2 = mulps(ray.rcp_dir.z.ps, subps(set1ps(box.pmax.z), ray.pos.z.ps)),
				zl1a = minps(zl1, cst::section.plus_inf.ps),  zl2a = minps(zl2, cst::section.plus_inf.ps),
				zl1b = maxps(zl1, cst::section.minus_inf.ps), zl2b = maxps(zl2, cst::section.minus_inf.ps);

			lmax = minps(maxps(zl1a,zl2a), lmax);
			lmin = maxps(minps(zl1b,zl2b), lmin);

			const bool_t rc = !mask_all(orps(cmplt(lmax, all_zero()),cmpgt(lmin, lmax)));
			//rs.t_near.ps	= maxps(lmin, all_zero());	// clamped to 0. more efficient here.
			rs.t_near.ps	= lmin; 
			rs.t_far.ps	= lmax;

			return rc;
		}

		/*
			A version which gets fed an inactive ray mask.

		*/

		void intersect_ray_tri(const wald_tri_t &tri, const ray_t &ray, hit_t &hit, const __m128 inactive) {
			const uint_t k = tri.k, ku = cst::wald_modulo[k], kv = cst::wald_modulo[k+1];
			const __m128
				&dir_k(ray.dir[k]), &dir_ku(ray.dir[ku]), &dir_kv(ray.dir[kv]),
				&pos_k(ray.pos[k]), &pos_ku(ray.pos[ku]), &pos_kv(ray.pos[kv]);
			
			// lnd = (ray.dir[k] + tri.n_u*dir_ku + tri.n_v*dir_kv)
			// num = (tri.n_d - ray.pos[k] - tri.n_u*pos_ku - tri.n_v*pos_kv)
			// t = num / lnd
			const __m128
				pl0	 = loadps((const float *) &tri),
				n_u	 = splatps(pl0, 1), n_v = splatps(pl0, 2), n_d = splatps(pl0, 3),
				pl1	 = loadps(((const float *) &tri) + 4),
				vert_ku	= splatps(pl1, 0), vert_kv	= splatps(pl1, 1);
			
			const __m128	
				mul_lnd1 = mulps(n_u, dir_ku), mul_lnd2 = mulps(n_v, dir_kv),
				mul_num1 = mulps(n_u, pos_ku), mul_num2 = mulps(n_v, pos_kv),
				cst_num1 = subps(n_d, pos_k),
				lnd = addps(addps(mul_lnd1, dir_k), mul_lnd2),
				numerator = subps(subps(cst_num1, mul_num1), mul_num2),
				#ifndef NO_DIV_PS
					t = divps(numerator, lnd);
				#else
					t = approx::div_nr_ps(numerator, lnd);
				#endif

			// hu = pos_ku + t*dir_ku - tri.vert_ku
			// hv = pos_kv + t*dir_kv - tri.vert_kv
			const __m128
				cst_hu = subps(pos_ku, vert_ku), cst_hv = subps(pos_kv, vert_kv),
				mul_hu = mulps(t, dir_ku), mul_hv = mulps(t, dir_kv),
				hu = addps(mul_hu, cst_hu), hv = addps(mul_hv, cst_hv);

			const __m128
				pl2		= loadps(((const float *) &tri) + 8),
				b_nu	= splatps(pl1, 2), b_nv	= splatps(pl1, 3), c_nu	= splatps(pl2, 0), c_nv	= splatps(pl2, 1);
				
			// beta  = hv*tri.b_nu + hu*tri.b_nv;
			// gamma = hu*tri.c_nu + hv*tri.c_nv;
			const __m128
				beta = addps(mulps(hv, b_nu), mulps(hu, b_nv)), gamma= addps(mulps(hu, c_nu), mulps(hv, c_nv)),

				cond1		= cmpgt(t, 		hit.t.ps),
				cond2		= cmplt(t, 		all_zero()),
				cond3		= cmplt(beta,	all_zero()),
				cond4		= cmplt(gamma,	all_zero()),
				cond5		= cmpge(addps(beta,gamma), cst::section.one.ps),
				cond_mask	= orps(orps(orps(orps(cond1, cond2), cond3), cond4), cond5),
				umask		= orps(cond_mask, inactive);

			// could keep the hit structure entirely in registers
			// and only update once the loop is done.
			// but that puts too much pressure on the register allocator in 32bit mode.
			hit.t.ps = cond_moveps(umask, t,		hit.t.ps);
			hit.u.ps = cond_moveps(umask, beta,		hit.u.ps);
			hit.v.ps = cond_moveps(umask, gamma,	hit.v.ps);
			hit.id.pi = cond_movepi(castps2pi(umask), _mm_shuffle_epi32(castps2pi(pl2), _MM_SHUFFLE(2,2,2,2)),	hit.id.pi);
		}

		void intersect_all(const scene_t &scene, const int list, const int count, const ray_t &ray, hit_t &hit, const __m128 inactive) {
			const kdtree::tri_id_t * __restrict const ids = &scene.kdtree.ids[list];
			for (int_t i=0; i<count; ++i) {
				const uint32_t tid = ids[i];
				intersect_ray_tri(scene.kdtree.acc[tid], ray, hit, inactive);
			}
		}
		
		/*
			this version is a bit weird but works kinda ok with all 3 compilers.
		*/
		void trace_segment(const scene_t &scene, const ray_t &ray, hit_t &hit, ray_segment_t &rs, trace_stack_t &stack) {
			stack.reset();

			__m128 
				t_near	= rs.t_near.ps,
				t_far	= rs.t_far.ps,
				inactive = cmpgt(t_near, t_far),	// currently inactive rays.
				done = inactive;					// terminated rays.
			const kdtree::node_t * __restrict node = scene.kdtree.root;
			do {
				#define flip_node(n)		(const kdtree::node_t * __restrict) (((unsigned long) (n)) ^ 8u)
				goto case_1;
			case_2:
				node = flip_node(node);
			case_1:
				if (node->is_leaf()) goto leaf_found;
				{
					const __m128 split = _mm_shuffle_ps(_mm_load_ss(&node->inner.split_coord), _mm_load_ss(&node->inner.split_coord), 0);
					const uint_t bits = node->inner.dim_offset_flag, axis = bits & 3u;
					const __m128 d(mulps(ray.rcp_dir[axis], subps(split, ray.pos[axis])));
								
					const int
						frt = (movemask(orps(cmpgt(d, t_far),	inactive))),
						bck = (movemask(orps(cmplt(d, t_near),	inactive)));
					
					const bool_t frontside = (frt == 15), backside = (bck == 15);
						
					node = (const kdtree::node_t * __restrict) (((unsigned long)node + (bits & kdtree::node_t::mask_children) + ray.signs[axis])^8);

					if (EXPECT_TAKEN(frontside))	goto case_1;
					if (EXPECT_TAKEN(backside))		goto case_2;
					{
						const uint32_t idx = uint32_t(stack.idx++); 

						stack.nodes[idx]				= flip_node(node); // push left
						stack.segments[idx].t_near.ps	= maxps(d, t_near);
						stack.segments[idx].t_far.ps	= t_far;
					
						t_far = minps(d, t_far);
					
						inactive = orps(inactive, cmpgt(t_near, t_far));	// update that one too
						goto case_1;
					}
				}
				#undef flip_node
			leaf_found:

				// intersect-a-leaf
				if (EXPECT_NOT_TAKEN(node->leaf.count > 0)) {
					intersect_all(scene, node->get_list(), node->leaf.count, ray, hit, inactive);
					// update terminated rays.
					done = orps(done, cmple(hit.t.ps, t_far));
				}

				const int  mask = movemask(done), idx = --stack.idx;
				const bool_t again = !((idx < 0) | (mask == 15));
				if (again) {
					node = stack.nodes[idx];
					t_near = stack.segments[idx].t_near.ps;
					t_far  = stack.segments[idx].t_far.ps;

					inactive = orps(done, cmpgt(t_near, t_far));
				}
				else
					break;	// early termination
			} while (true); // forever
		}



	}
}
#include "horde.h"
namespace horde {

	NOINLINE void grunt_render_tiles_packet(const rt::renderer_t &renderer) {
		point_t point;
		while (horde::lifo.pop(point)) // food fighting
			rt::packet::do_render_tile(renderer, point);
	}
}

